GSE143290 a mouse bulk RNASeq experiment on the
small intenstine infection vs. control group as our reference data-set
for power analysis.Trex1 and
Ifrd11 and their related genes (identified by
string-db.org) as the list of genes we wanted to have enough statistical
power to test for in an experiment; and what is the minimum number of
samples we needed to reach this sufficient power (>80%).n>=13 as the threshold to
reach sufficient power for highly and medium expressed genes;
n>=21 as the threshold for lowly expressed genes.# file parsing and cleaning
d <- read.delim("/Users/paulcao/Downloads/GSE143290_SI_study_RawCounts.txt", header = TRUE,
sep = "\t")
dataMat <- d[, c(8, 9, 10, 11, 12, 13)]
rownames(dataMat) <- d$Geneid
colnames(dataMat) <- c("VAS_Control_1", "VAS_Control_2", "VAS_Control_3", "VAS_Infected_1",
"VAS_Infected_2", "VAS_Infected_3")
head(dataMat)
## VAS_Control_1 VAS_Control_2 VAS_Control_3 VAS_Infected_1 VAS_Infected_2
## Xkr4 0 3 5 2 4
## Rp1 0 0 2 0 0
## Sox17 31 63 34 89 34
## Mrpl15 2081 1767 2733 1624 1963
## Lypla1 7946 7399 7717 7515 6790
## Tcea1 2885 2237 2808 2003 2558
## VAS_Infected_3
## Xkr4 5
## Rp1 2
## Sox17 35
## Mrpl15 1671
## Lypla1 6798
## Tcea1 2305
RNAseqsamplesize [@Zhao2018] was used to do the analysis. With expected fold change between groups = 2, FDR set = 0.01 and number of samples = 5, power was computed to be 0.086 for the selected genes of interest which is very low. Some gene IDs pulled from Biomart are not in the dataset because they are alleles on alternative sequences (i.e. rapsn and Fbxo32) Dispersion was also computed as 0.04754.
# estimate gene read count and dispersion distribution
distribution <- est_count_dispersion(dataMat, group = c(0, 0, 0, 1, 1, 1))
## Disp = 0.05714 , BCV = 0.239
Gene.name <- c("Trex1", "Ifrd1", "Ddost", "Rpn1", "Stt3b", "Krtcap2", "Tmem258",
"Ostc", "Rnaseh2a", "Dad1", "Tusc3", "Stt3a", "Ankmy2", "Mef2c", "Ifnl2", "Sap30",
"Fau", "Gm9843", "Dixdc1", "Bhlhb9", "Hdac1", "Sin3b")
Gene.stable.ID <- sprintf("Gene % d", 1:22)
genelist <- data.frame(Gene.name, Gene.stable.ID)
dim(genelist)
## [1] 22 2
power <- est_power_distribution(n = 5, f = 0.01, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
mean(power$power) # 0.217285
## [1] 0.217285
From changing around FDR (fdr parameter in
est_power_curve()) and coverage (lambda0
parameter in est_power_curve()) as well as making an
optimization plot, it appears that at least 10 samples are needed with
an average coverage > 10 reads/gene depending on desired FDR in order
to achieve >80% power. To be precise, 10 samples with 10 coverage and
FDR=0.05 will give 82% power. To be safe, for the genes of interest we
would need at least 13 samples to achieve 80% power.
Red line is FDR=0.01, coverage=5. Blue line is FDR=0.05, coverage=5. Purple line is FDR=0.01, coverage=10. Green line is FDR=0.05, coverage=10. Yellow line is FDR=0.05, coverage=20.
Blue to brown gradient shows power from 0 to 1. Here FDR=0.01.
est_power(n = 8, lambda0 = 20, phi0 = 0.07154, f = 0.05, m = 52000, m1 = 71)
## [1] 0.51
power10 <- est_power_distribution(n = 10, f = 0.05, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power10$power)
## [1] 0.8668957
power13 <- est_power_distribution(n = 13, f = 0.05, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power13$power)
## [1] 0.9587752
power15 <- est_power_distribution(n = 15, f = 0.05, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power15$power)
## [1] 0.9817761
power18 <- est_power_distribution(n = 18, f = 0.05, rho = 2, minAveCount = 15, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power18$power)
## [1] 0.9932155
gene_readcounts <- distribution$pseudo.counts.mean[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]
gene_dispersions <- distribution$tagwise.dispersion[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]
gene_readcounts
## Ankmy2 Ifrd1 Mef2c Dad1 Fau Tmem258 Krtcap2
## 921.06499 1856.91929 225.89718 2459.88159 9697.99371 1441.02956 980.94882
## Ostc Hdac1 Ddost Rpn1 Tusc3 Sap30 Sin3b
## 4170.09639 2309.83045 6186.40215 5731.93796 422.96087 253.98447 1791.29028
## Rnaseh2a Stt3a Dixdc1 Trex1 Stt3b Bhlhb9
## 451.07636 4412.36483 71.54271 333.20858 5433.68299 80.60489
mean(gene_readcounts)
## [1] 2461.636
mean(gene_dispersions)
## [1] 0.05031733
ggplot(data=data.frame(counts=as.numeric(gene_readcounts),name=as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)]))) + geom_bar(aes(x=reorder(name,-counts),y=counts),stat="identity") + scale_y_log10() + coord_flip() + xlab("Gene name") + ylab("Mean count per gene across samples") + theme(axis.text.y = element_text(size=7)) + ggtitle("Mean count per gene across samples from experimental data") + labs(caption="Each sample has approximately 50 million reads. Experiment ID GSE58669")
gene_readcounts
## Ankmy2 Ifrd1 Mef2c Dad1 Fau Tmem258 Krtcap2
## 921.06499 1856.91929 225.89718 2459.88159 9697.99371 1441.02956 980.94882
## Ostc Hdac1 Ddost Rpn1 Tusc3 Sap30 Sin3b
## 4170.09639 2309.83045 6186.40215 5731.93796 422.96087 253.98447 1791.29028
## Rnaseh2a Stt3a Dixdc1 Trex1 Stt3b Bhlhb9
## 451.07636 4412.36483 71.54271 333.20858 5433.68299 80.60489
ss<-sample_size_distribution(f=0.05,distributionObject=distribution,selectedGenes=genelist$Gene.name,storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
ss
## $iter
## [1] 7
##
## $f.root
## [1] 0.05291029
##
## $root
## [1] 10
##
## $process
## N Power
## [1,] 1 0.007816436
## [2,] 9 0.791027221
## [3,] 10 0.852910286
## [4,] 11 0.897237676
## [5,] 13 0.953507915
## [6,] 17 0.989958990
## [7,] 33 0.996009604
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 5e-02 1e+00 2e+00 1e+00
names <- as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)])
to_drop <- names[which(gene_readcounts < 10)]
ss_drop10 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=10)),storeProcess = TRUE)
# 76.7% power with N=9, 83.1% power with N=10
ss_drop10
## $iter
## [1] 7
##
## $f.root
## [1] 0.04509534
##
## $root
## [1] 9
##
## $process
## N Power
## [1,] 1 0.0185601
## [2,] 5 0.4384431
## [3,] 7 0.6838682
## [4,] 8 0.7798710
## [5,] 9 0.8450953
## [6,] 17 0.9928774
## [7,] 33 0.9960096
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00
to_drop80 <- names[which(gene_readcounts < 80)]
ss_drop80 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
# basically when rho goes up and genes go down it works
# so what exactly is rho?
# basically effect size... so question: do we expect it to be bigger or smaller than in the 2 group situation?
ss_drop80
## $iter
## [1] 7
##
## $f.root
## [1] 0.04680084
##
## $root
## [1] 9
##
## $process
## N Power
## [1,] 1 0.01864329
## [2,] 5 0.44028725
## [3,] 7 0.68598219
## [4,] 8 0.78185105
## [5,] 9 0.84680084
## [6,] 17 0.99300038
## [7,] 33 0.99600921
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00
to_drop200 <- names[which(gene_readcounts < 200)]
ss_drop200 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=200)),storeProcess = TRUE)
ss_drop80_fold2.5 <- sample_size_distribution(f=0.1,rho=2.5,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
ss_drop80_fold2.5
## $iter
## [1] 7
##
## $f.root
## [1] 0.04128189
##
## $root
## [1] 5
##
## $process
## N Power
## [1,] 1 0.05840905
## [2,] 3 0.49249287
## [3,] 4 0.70251460
## [4,] 5 0.84128189
## [5,] 9 0.99056980
## [6,] 17 0.99601169
## [7,] 33 0.99600857
##
## $parameters
## power m m1 fdr w rho k
## 0.8 10000.0 100.0 0.1 1.0 2.5 1.0
kept <- append(rownames(genelist[grepl("Trex[0-9]+$", genelist$Gene.name), ]), rownames(genelist[grepl("Ifrd[0-9]+$", genelist$Gene.name), ]))
genelist[kept,]$Gene.name[which(!genelist[kept,]$Gene.stable.ID %in% names(distribution$pseudo.counts.mean))]
## [1] "Trex1" "Ifrd1"
kept_final <- kept[which(genelist[kept,]$Gene.name %in% names(distribution$pseudo.counts.mean))]
power_kept <- est_power_distribution(n=6,f=0.1,m=52000,m1=4000,distributionObject=distribution,selectedGenes=c("Trex1", "Ifrd1"),storeProcess = TRUE)
mean(power_kept$power) #32% power with just kept genes and 6 samples each group
## [1] 0.8259586
ss_kept <- sample_size_distribution(f=0.1,rho=2,distributionObject=distribution,selectedGenes=c("Trex1", "Ifrd1"),storeProcess=TRUE)
ss_kept
## $iter
## [1] 7
##
## $f.root
## [1] 0.04720392
##
## $root
## [1] 9
##
## $process
## N Power
## [1,] 1 0.01867239
## [2,] 5 0.44064713
## [3,] 7 0.68643015
## [4,] 8 0.78229602
## [5,] 9 0.84720392
## [6,] 17 0.99304494
## [7,] 33 0.99600599
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
nrow(found)
## [1] 20
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
found
## Gene.name Gene.stable.ID
## 1 Trex1 Gene 1
## 2 Ifrd1 Gene 2
## 3 Ddost Gene 3
## 4 Rpn1 Gene 4
## 5 Stt3b Gene 5
## 6 Krtcap2 Gene 6
## 7 Tmem258 Gene 7
## 8 Ostc Gene 8
## 9 Rnaseh2a Gene 9
## 10 Dad1 Gene 10
## 11 Tusc3 Gene 11
## 12 Stt3a Gene 12
## 13 Ankmy2 Gene 13
## 14 Mef2c Gene 14
## 16 Sap30 Gene 16
## 17 Fau Gene 17
## 19 Dixdc1 Gene 19
## 20 Bhlhb9 Gene 20
## 21 Hdac1 Gene 21
## 22 Sin3b Gene 22
n <- 1:15
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=found$Gene.name,storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess=TRUE)$power)
# Trex1:
musk_id <- which(found$Gene.name=='Trex1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
# Ifrd1:
ifrd1_id <- which(found$Gene.name=='Ifrd1')
ifrd1_power <- sapply(gene_powers, function(x) x[ifrd1_id])
total_power <- sapply(gene_powers, function(x) mean(x))
total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))
# Number of genes vs number of samples at power=0.8
d <- data.frame(n,num_genes)
ggplot(data=d,aes(x=n,y=num_genes,label=num_genes)) + geom_line() + geom_point() + theme_bw() + ylab("Number of genes with power > 0.8") + xlab("Number of samples in each group") + ggtitle("Number of genes of interest with sufficient power to detect\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2") + geom_text(data=subset(d, n>7),vjust=0,nudge_y=1) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,70,10))
# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000))
# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,musk_power,ifrd1_power,total_power,gene_powers_random) %>% tidyr::gather("type","power",musk_power,ifrd1_power,total_power,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for CDH1 gene and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("musk_power","ifrd1_power","total_power","gene_powers_random"),labels=c("Trex1","Ifrd1","Total for interested genes","Random genes")) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,1,0.2))
avg_count <- transmute(dataMat,m1=(Innervated_WT_1+Innervated_WT_2)/2,m2=(Denervated_WT_1+Denervated_WT_2)/2)
log2_fold <- log2(avg_count$m2/avg_count$m1)>2
total_de<-sum(log2_fold[!is.na(log2_fold)]) # about 4000
library(PROPER)
ngenes = nrow(dataMat)
simOptions = RNAseq.SimOptions.2grp(ngenes=ngenes,lBaselineExpr="bottomly",lOD="bottomly")
simres = runSims(sim.opts=simOptions,nsims=20)
powers = comparePower(simres, alpha.type='fdr', alpha.nominal=0.1, stratify.by='expr')
library(fitdistrplus)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: survival
samples <- as.numeric(array(distribution$pseudo.counts.mean))
samples <- as.integer(samples)
negbinom <- fitdist(samples, "nbinom")
summary(negbinom)
## Fitting of the distribution ' nbinom ' by maximum likelihood
## Parameters :
## estimate Std. Error
## size 0.3214267 0.002736276
## mu 1401.5124212 21.297744213
## Loglikelihood: -133454.8 AIC: 266913.6 BIC: 266929.1
## Correlation matrix:
## size mu
## size 1.000000e+00 7.759517e-05
## mu 7.759517e-05 1.000000e+00
quantile(negbinom)
## Estimated quantiles for each specified probability (non-censored data)
## p=0.1 p=0.2 p=0.3 p=0.4 p=0.5 p=0.6 p=0.7 p=0.8 p=0.9
## estimate 2 21 74 184 381 709 1250 2184 4098
plot(negbinom)
Based on the quantiles as calculated by the fitted negative binomial distribution, we can bin the genes of interest into the following three bins:
Highly expressed genes (p >0.7 or gene count > 1250):
c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc")
Medium expressioned genes (0.4<p<0.7)
c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3")
Lowly expressioned genes (p<0.4):
c("Dixdc1", "Bhlhb9") plus some randomly selected genes
"c(Sox17", "Xkr4", "A830018L16Rik") to get to a total of 5
genes.
n <- 1:15
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc"),storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc"),storeProcess=TRUE)$power)
gene_powers2 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, maxAveCount = 50000, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3"),storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers2,function(x) sum(x>0.8))
gene_powers_keptfinal2 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3"),storeProcess=TRUE)$power)
gene_powers3 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Dixdc1", "Bhlhb9", "Sox17", "Xkr4", "A830018L16Rik"), storeProcess=TRUE)$power)
#num_genes <- sapply(gene_powers3,function(x) sum(x>0.8))
#gene_powers_keptfinal3 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Dixdc1", "Bhlhb9"),storeProcess=TRUE)$power)
# Trex1:
musk_id <- which(found$Gene.name=='Trex1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
# Ifrd1:
ifrd1_id <- which(found$Gene.name=='Ifrd1')
ifrd1_power <- sapply(gene_powers, function(x) x[ifrd1_id])
total_power_high <- sapply(gene_powers, function(x) mean(x))
total_power_medium <- sapply(gene_powers2, function(x) mean(x))
total_power_low <- sapply(gene_powers3, function(x) mean(x))
total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))
# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000))
# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,total_power_high,total_power_medium,total_power_low,gene_powers_random) %>% tidyr::gather("type","power", total_power_high,total_power_medium,total_power_low,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for Highly, Medium and Lowly Expressed genes and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("total_power_high","total_power_medium","total_power_low","gene_powers_random"),labels=c("Highly Expressed Genes", "Medium Expressed Genes","Lowly Expressed Genes","Random Genes")) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,1,0.2))
library(fitdistrplus)
samples <- as.numeric(array(distribution$pseudo.counts.mean))
samples <- as.integer(samples)
negbinom <- fitdist(samples, "nbinom")
summary(negbinom)
## Fitting of the distribution ' nbinom ' by maximum likelihood
## Parameters :
## estimate Std. Error
## size 0.3214267 0.002736276
## mu 1401.5124212 21.297744213
## Loglikelihood: -133454.8 AIC: 266913.6 BIC: 266929.1
## Correlation matrix:
## size mu
## size 1.000000e+00 7.759517e-05
## mu 7.759517e-05 1.000000e+00
quantile(negbinom)
## Estimated quantiles for each specified probability (non-censored data)
## p=0.1 p=0.2 p=0.3 p=0.4 p=0.5 p=0.6 p=0.7 p=0.8 p=0.9
## estimate 2 21 74 184 381 709 1250 2184 4098
plot(negbinom)
Based on the quantiles as calculated by the fitted negative binomial distribution, we can bin the genes of interest into the following three bins:
Highly expressed genes (p >0.7 or gene count > 1250):
c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc")
Medium expressioned genes (0.4<p<0.7)
c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3")
Lowly expressioned genes (p<0.4):
c("Dixdc1", "Bhlhb9") plus some randomly selected genes
"c(Sox17", "Xkr4", "A830018L16Rik") to get to a total of 5
genes.
n <- 1:25
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=15009,m1=22,selectedGenes=c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc"),storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=15009,m1=22,selectedGenes=c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc"),storeProcess=TRUE)$power)
gene_powers2 <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, maxAveCount = 50000, distributionObject=distribution, m=15009,m1=22,selectedGenes=c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3"),storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers2,function(x) sum(x>0.8))
gene_powers_keptfinal2 <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=15009,m1=22,selectedGenes=c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3"),storeProcess=TRUE)$power)
gene_powers3 <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=15009,m1=22,selectedGenes=c("Dixdc1", "Bhlhb9", "Sox17", "Xkr4", "A830018L16Rik"), storeProcess=TRUE)$power)
#num_genes <- sapply(gene_powers3,function(x) sum(x>0.8))
#gene_powers_keptfinal3 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Dixdc1", "Bhlhb9"),storeProcess=TRUE)$power)
# Trex1:
musk_id <- which(found$Gene.name=='Trex1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
# Ifrd1:
ifrd1_id <- which(found$Gene.name=='Ifrd1')
ifrd1_power <- sapply(gene_powers, function(x) x[ifrd1_id])
total_power_high <- sapply(gene_powers, function(x) mean(x))
total_power_medium <- sapply(gene_powers2, function(x) mean(x))
total_power_low <- sapply(gene_powers3, function(x) mean(x))
total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))
# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=15009,m1=22))
# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,total_power_high,total_power_medium,total_power_low,gene_powers_random) %>% tidyr::gather("type","power", total_power_high,total_power_medium,total_power_low,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for Highly, Medium and Lowly Expressed genes and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("total_power_high","total_power_medium","total_power_low","gene_powers_random"),labels=c("Highly Expressed Genes", "Medium Expressed Genes","Lowly Expressed Genes","Random Genes")) + scale_x_continuous(breaks=seq(1,25,2)) + scale_y_continuous(breaks=seq(0,1,0.2))